rm(list=ls())
library(limma)
library(xlsx)
library(pheatmap)
library(xtable)
library(knitr)

1 Data QC and preprocessing for microarray data

ed = read.xlsx("ExpDesign.xls", 1, stringsAsFactors=F)
ed$sample_ID=as.character(ed$Sample.Number)
ed$ShortID = ifelse(ed$DS_TumorOrganoids, paste(ed$Condition, ed$Site, ed$patient), paste(ed$Condition, ed$Site, ed$patient, ed$Medium))
rownames(ed) = gsub(".txt","",ed$filename)

setwd("../../Data/Raw/")
agilent.datacolumns=list(E='gMedianSignal',Eb = 'gBGMedianSignal',isNonUniform='gIsFeatNonUnifOL', isPopOutlier='gIsFeatPopnOL');
intensities =read.maimages(paste(ed$filename,sep=""), source="agilent.median", green.only=TRUE, columns=agilent.datacolumns)
## Read US22502595_251485083363_S01_GE1_1105_Oct12_1_1.txt 
## Read US22502595_251485083363_S01_GE1_1105_Oct12_1_2.txt 
## Read US22502595_251485083363_S01_GE1_1105_Oct12_1_3.txt 
## Read US22502595_251485083363_S01_GE1_1105_Oct12_1_4.txt 
## Read US22502595_251485083374_S01_GE1_1105_Oct12_1_1.txt 
## Read US22502595_251485083374_S01_GE1_1105_Oct12_1_2.txt 
## Read US22502595_251485083374_S01_GE1_1105_Oct12_1_3.txt 
## Read US22502595_251485083374_S01_GE1_1105_Oct12_1_4.txt 
## Read US22502595_251485083362_S01_GE1_1105_Oct12_1_1.txt 
## Read US22502595_251485083362_S01_GE1_1105_Oct12_1_2.txt 
## Read US22502595_251485083362_S01_GE1_1105_Oct12_1_3.txt 
## Read US22502595_251485083362_S01_GE1_1105_Oct12_1_4.txt 
## Read US22502595_251485083359_S01_GE1_1105_Oct12_1_1.txt 
## Read US22502595_251485083359_S01_GE1_1105_Oct12_1_2.txt 
## Read US22502595_251485083359_S01_GE1_1105_Oct12_1_3.txt 
## Read US22502595_251485083359_S01_GE1_1105_Oct12_1_4.txt 
## Read US22502595_251485083377_S01_GE1_1105_Oct12_1_1.txt 
## Read US22502595_251485083377_S01_GE1_1105_Oct12_1_2.txt 
## Read US22502595_251485083377_S01_GE1_1105_Oct12_1_3.txt 
## Read US22502595_251485083377_S01_GE1_1105_Oct12_1_4.txt 
## Read US22502595_251485083378_S01_GE1_1105_Oct12_1_1.txt 
## Read US22502595_251485083378_S01_GE1_1105_Oct12_1_2.txt 
## Read US22502595_251485083378_S01_GE1_1105_Oct12_1_3.txt 
## Read US22502595_251485083378_S01_GE1_1105_Oct12_1_4.txt 
## Read US22502595_251485072266_S02_GE1_1105_Oct12_1_1.txt 
## Read US22502595_251485072266_S02_GE1_1105_Oct12_1_2.txt 
## Read US22502595_251485072266_S02_GE1_1105_Oct12_1_3.txt 
## Read US22502595_251485072266_S02_GE1_1105_Oct12_1_4.txt
setwd("../../Code/GeneExpressionMicroarray/")

# fix outdated chip annotations
new_anno_file = "../../Data/Raw/Agilent_14850_annotations_2017-02-08.Rdata"
load(new_anno_file)
old_anno = intensities$genes
take_over_cols = colnames(old_anno)[!colnames(old_anno) %in% c("GeneName","Description","SystematicName")]
tmp = old_anno[,take_over_cols]
tmp$index=1:nrow(tmp)
tmp = merge(tmp, anno_tab_14850, by.x="ProbeName", by.y="ProbeID", all.x=T, sort=F)
new_col_order = c(take_over_cols, colnames(tmp)[!colnames(tmp) %in% take_over_cols])
new_col_order = new_col_order[!new_col_order %in% c("GO_BP","GO_CC","GO_MF")]
new_anno = tmp[order(tmp$index),new_col_order]

intensities$genes = new_anno
intensities$probe_exclude=(intensities$isNonUniform>0)|(intensities$isPopOutlier>0)
intensities$E[intensities$probe_exclude] <- NA

1.1 Data overview

This document describes the preprocessing and QC of microarray data from project MB189 tumor/normal tissues and organoids (Mirjana Kessler, Karen Hoffmann) using a single-color hybridization. Micro arrays used had design Agilent 014850 (4x44k Whole Human Genome v1).

1.1.1 Samples

cols_to_exclude = c("filename", "Sample.Number")
ed_filtered = ed[,!colnames(ed) %in% cols_to_exclude]
rownames(ed_filtered) <- NULL
#print.xtable(xtable(ed_filtered,display=rep("s",ncol(ed)-length(cols_to_exclude)+1), align=paste(c(rep("|l",ncol(ed)-length(cols_to_exclude)+1),"|"), collapse="")), type="html", file="" , include.rownames=F)
kable(ed_filtered)
Chip.Barcode patient Condition Site Medium PublicationID DS_TumorOrganoids PublicationID_verbose sample_ID ShortID
251485083363_1_1 7 Tissue Tumor none OC7-T 1 Tissue Tumor OC7 1 Tissue Tumor 7
251485083363_1_2 7 Organoid Tumor M1 OC7-O 1 Organoid Tumor OC7 2 Organoid Tumor 7
251485083363_1_3 9 Tissue Tumor none OC9-T 1 Tissue Tumor OC9 3 Tissue Tumor 9
251485083363_1_4 9 Organoid Tumor M1 OC9-O 1 Organoid Tumor OC9 4 Organoid Tumor 9
251485083374_1_1 8 Tissue Tumor none OC8-T 1 Tissue Tumor OC8 5 Tissue Tumor 8
251485083374_1_2 8 Organoid Tumor M1 OC8-O 1 Organoid Tumor OC8 6 Organoid Tumor 8
251485083374_1_3 5 Organoid Tumor M1 OC5-O 1 Organoid Tumor OC5 7 Organoid Tumor 5
251485083374_1_4 5 Tissue Tumor none OC5-T 1 Tissue Tumor OC5 8 Tissue Tumor 5
251485083362_1_1 4 Tissue Tumor none OC4-T 1 Tissue Tumor OC4 9 Tissue Tumor 4
251485083362_1_2 4 Organoid Tumor M1 OC4-O 1 Organoid Tumor OC4 10 Organoid Tumor 4
251485083362_1_3 6 Tissue Tumor none OC6-T 1 Tissue Tumor OC6 11 Tissue Tumor 6
251485083362_1_4 6 Organoid Tumor M1 OC6-O 1 Organoid Tumor OC6 12 Organoid Tumor 6
251485083359_1_1 1 Organoid Normal StandardFT FT OC1-O 1 Organoid Normal OC1 13 Organoid Normal 1
251485083359_1_2 2 Organoid Normal StandardFT FT OC2-O 1 Organoid Normal OC2 14 Organoid Normal 2
251485083359_1_3 2 Organoid Tumor M1 OC2-O 1 Organoid Tumor OC2 15 Organoid Tumor 2
251485083359_1_4 1 Organoid Tumor M1 OC1-O 1 Organoid Tumor OC1 16 Organoid Tumor 1
251485083377_1_1 217 Tissue Normal none FT217-T 1 Tissue Normal FT217 17 Tissue Normal 217
251485083377_1_2 217 Organoid Normal StandardFT FT217-O 1 Organoid Normal FT217 18 Organoid Normal 217
251485083377_1_3 223 Tissue Normal none FT223-T 1 Tissue Normal FT223 19 Tissue Normal 223
251485083377_1_4 223 Organoid Normal StandardFT FT223-O 1 Organoid Normal FT223 20 Organoid Normal 223
251485083378_1_1 231 Tissue Normal none FT231-T 1 Tissue Normal FT231 21 Tissue Normal 231
251485083378_1_2 231 Organoid Normal StandardFT FT231-O 1 Organoid Normal FT231 22 Organoid Normal 231
251485083378_1_3 1 Tissue Tumor none OC1-T 1 Tissue Tumor OC1 23 Tissue Tumor 1
251485083378_1_4 2 Tissue Tumor none OC2-T 1 Tissue Tumor OC2 24 Tissue Tumor 2
251485072266_1_1 7 Organoid Tumor M1 OC7-O 0 Organoid Tumor OC7-M1 25 Organoid Tumor 7 M1
251485072266_1_2 7 Organoid Tumor M1_Wnt OC7-O 0 Organoid Tumor OC7-M1_Wnt 26 Organoid Tumor 7 M1_Wnt
251485072266_1_3 4 Organoid Tumor M1 OC4-O 0 Organoid Tumor OC4-M1 27 Organoid Tumor 4 M1
251485072266_1_4 4 Organoid Tumor M1_Wnt OC4-O 0 Organoid Tumor OC4-M1_Wnt 28 Organoid Tumor 4 M1_Wnt

1.2 Raw intensity data

1.2.1 Excluded probes (non-uniform feature or population outlier)

par(mar=c(12,4,4,2))
ex_cnt = apply(intensities$probe_exclude,2,sum)
barplot(ex_cnt, las=2, names.arg = ed[names(ex_cnt), "ShortID"], main = "number of excluded probes/sample")

The array Tissue Tumor 2 shows optical artefacts generating bias in a larger number of probes.

1.2.2 Intensity distribution across samples

par(mfrow=c(2,1), mar=c(11,4,4,1))
ii = log2(intensities$E)
colnames(ii) = ed[colnames(ii),]$ShortID
boxplot(ii, las=2, main = "Raw FG intensities", ylim=c(0,20))
ii = log2(intensities$Eb)
colnames(ii) = ed[colnames(ii),]$ShortID
boxplot(ii, las=2, main = "Raw BG intensities", ylim=c(0,20))
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 25 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 26 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 27 is not drawn
## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 28 is not drawn

par(mfrow=c(1,1))
plotDensities(intensities, group = ed[colnames(intensities$E),"ShortID"], legend="topright", main="Intensity distribution by experiment run ID")
## Warning in plotDensities.EListRaw(intensities, group =
## ed[colnames(intensities$E), : NaNs produced

There are around 1500-2500 probes with background equal to zero in the last 4 arrays (M1 and M1_Wnt comparison) - not clear why this happened. Since we anyway will estimate the background from all probes using a statistical model below we will ignore this issue.

1.2.3 Correlation of raw intensities across samples

raw_cor = cor(log2(intensities$E),method="spearman", use="pairwise")
colnames(raw_cor) = ed[colnames(raw_cor),]$ShortID
#rownames(raw_cor) = ed[rownames(raw_cor),]$sample_ID
pheatmap(raw_cor, cluster_rows = F, cluster_cols=F)

1.3 Preprocessing

Data will be preprocessed as follows:

  • Background for each array will be corrected using the “normexp” method (Ritchie et al 2007, Bioinformatics, p. 2700-07) and an offset of 15.
  • Between-array normalization will be perfomed using the quantile method (Bolstad et al 2003, Bioinformatics, p. 185).

1.4 Background correction

bg_corrected = backgroundCorrect(intensities,method="normexp",offset=15)

1.4.1 Intensity distribution across samples after background correction

par(mar=c(12,4,4,1))
boxplot(log2(bg_corrected$E), las=2,  names = ed[colnames(bg_corrected$E), "ShortID"], main = "BG-corrected FG intensities", ylim=c(-2,20))

1.4.2 Normalization

normalized = normalizeBetweenArrays(bg_corrected,method="quantile")

1.4.2.1 Intensity distribution across samples after BG correction and normalization

boxplot(log2(normalized$E), las=2, names = ed[colnames(normalized$E), "ShortID"],main = "Normalized FG intensities", ylim=c(-2,20))

1.4.2.2 Correlation of normalized intensities across samples

norm_cor = cor(log2(normalized$E),method="spearman", use="pairwise")
colnames(norm_cor) = ed[colnames(norm_cor),]$sample_ID
rownames(norm_cor) = ed[rownames(norm_cor),]$ShortID
pheatmap(norm_cor)

1.5 Primary Component Analysis

norm_exp = normalized$E
NA_rows = apply(norm_exp,1,function(x) sum(is.na(x)))
pca = prcomp(t(norm_exp[NA_rows==0,]))
imp = summary(pca)$importance
cp = palette(rainbow(8))
plot(pca$x[,1], pca$x[,2], type="p", xlab=paste("1st PC (",round(imp[2,1],2)*100 ,"% var explained)",sep="" ), ylab=paste("2nd PC (",round(imp[2,2],2)*100 ,"% var explained)",sep="" ), main="PCA on normalized expression data", xlim=c(min(pca$x[,1])*1.5,max(pca$x[,1])*4))
text(pca$x[,1],pca$x[,2],labels=ed[colnames(normalized$E),]$ShortID, col=cp[as.numeric(as.factor(ed[colnames(normalized$E),]$Condition))], adj=-0.05)
abline(h=0)
abline(v=0)

1.6 Control probes

The following control probes exist on the arrays used in this experiment:

  • Corner associated (used for orientation purposes during scanning)
  • Bright corner
  • Dark corner
  • Negative controls
  • 3xSLv1 (hairpin probe that does not hybridize well with any possible RNA)
  • Positive controls
  • Human GAPDH and PGK1 probes
  • Deletion stringency probes (DCP, probe with varying number of insertions/changes with respect to reference; the number after the “_" denotes the number of differences to the reference which should correlate with lower expression)
  • E1A_r60: spike-in probes with concentrations that should cover the whole dynamic range of the array

There are a few other expression probes that are used by Agilent’s feature extraction/QC pipeline.

control_probes = which(intensities$genes$ControlType!=0)
cp_data = intensities$E[control_probes,]
cp_names = intensities$genes[control_probes,]
selected_controls = ifelse(substr(cp_names$ProbeName,1,4)=="ERCC",F,T)

# control probes
for (i in 1:ncol(cp_data)) {
  boxplot(log2(cp_data[selected_controls,i]) ~ factor(cp_names$ProbeName[selected_controls]),las=2, main=paste("Sample",i), outline=F)
}

1.7 Number of detected probes

Signal for probes will be considered detected if the expression of a probe is above the 95% quantile of expression in negative control probes. In a typical experiment one would expect about 50% of all genes to be detectable.

neg95 <- apply(normalized$E[normalized$genes$ControlType==-1,],2,function(x) quantile(x,p=0.95, na.rm=T))
cutoff <- matrix(1.1*neg95,nrow(normalized),ncol(normalized),byrow=TRUE)
isexpr <- rowSums(normalized$E > cutoff, na.rm=T) >= 2
# about 50% of genes should be detected
#table(isexpr)
isexpr_sample <- colSums(normalized$E > cutoff, na.rm=T)
sample_order = order(ed[names(isexpr_sample),]$ShortID)
isexpr_sample_ordered = isexpr_sample[sample_order]
nn = ed[names(isexpr_sample),]$ShortID[sample_order]
par(mar=c(15,4,4,2))
barplot(isexpr_sample_ordered, main="number of detected probes per sample", las=2, names.arg=nn)

par(mar=c(8,4,4,2))
save(ed, intensities, normalized, file="../../Data/Processed/Tumor_organoids_micro_array_preprocessed_data.Rdata")

2 Software versions

sessionInfo()
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS: /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8          LC_NUMERIC=C                 
##  [3] LC_TIME=en_US.UTF-8           LC_COLLATE=en_US.UTF-8       
##  [5] LC_MONETARY=en_US.UTF-8       LC_MESSAGES=en_US.UTF-8      
##  [7] LC_PAPER=en_US.UTF-8          LC_NAME=en_US.UTF-8          
##  [9] LC_ADDRESS=en_US.UTF-8        LC_TELEPHONE=en_US.UTF-8     
## [11] LC_MEASUREMENT=en_US.UTF-8    LC_IDENTIFICATION=en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] knitr_1.20     xtable_1.8-2   pheatmap_1.0.8 xlsx_0.5.7    
## [5] xlsxjars_0.6.1 rJava_0.9-9    limma_3.34.9  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.0         magrittr_1.5       munsell_0.4.3     
##  [4] colorspace_1.3-2   highr_0.6          stringr_1.3.0     
##  [7] plyr_1.8.4         tools_3.4.3        grid_3.4.3        
## [10] gtable_0.2.0       htmltools_0.3.6    yaml_2.1.16       
## [13] rprojroot_1.3-2    digest_0.6.15      RColorBrewer_1.1-2
## [16] evaluate_0.10.1    rmarkdown_1.8      stringi_1.1.6     
## [19] compiler_3.4.3     scales_0.5.0       backports_1.1.2